# Load ZTrax data on California home transactions (ZTrans)

pacman::p_load(data.table, tidyverse, readxl, fst)

rm(list = ls())

source("code/globals.R")

vintage_dir <- file.path(INPUT_PRIVATE, "ZTRAX", "transactions", "20210802")

# Load the layout of Ztrans
layoutZTrans <- read_excel(file.path(vintage_dir, "Layout.xlsx"),
  sheet = 2,
  col_types = c("text", "text", "numeric", "text", "text")
)

col_namesProp <- layoutZTrans[layoutZTrans$TableName == "utPropertyInfo", "FieldName"]

# Load property info ----

# Create a data.frame of desired variables from the property info table
vars_prop <- layoutZTrans %>% 
  filter(TableName == "utPropertyInfo",
         FieldName %in% c("TransId", "PropertySequenceNumber", "LoadID", "ImportParcelID"))

propTrans <- fread(file.path(vintage_dir, "06/ZTrans/PropertyInfo.txt"),
                   sep = "|",
                   header = FALSE,
                   stringsAsFactors = FALSE,
                   quote = "",
                   select = vars_prop$column_id,
                   col.names = vars_prop$FieldName,
                   encoding = "Latin-1"
)

setkey(propTrans, TransId, PropertySequenceNumber, LoadID)

# Keep only one record for each TransID and PropertySequenceNumber.
# TransID is the unique identifier of a transaction, which could have multiple properties sequenced by PropertySequenceNumber.
# Multiple entries for the same TransID and PropertySequenceNumber are due to updated records.
# The most recent record is identified by the greatest LoadID.
propTrans <- unique(propTrans, by = c("TransId", "PropertySequenceNumber"))
propTrans[, LoadID := NULL]

propTrans[, multiple_parcels := any(PropertySequenceNumber > 1), by = TransId]
propTrans <- propTrans[multiple_parcels == FALSE]

write_fst(propTrans, file.path(WORKING, "ztraxdata/ZTrans/propTrans.fst"))

rm(propTrans)

# Load main transactions table ----

# Get table of desired variables
vars_main <- layoutZTrans %>%
  filter(TableName == "utMain",
         FieldName %in% c("TransId", "FIPS", "LoadID",
                          "RecordingDate", "DocumentDate", "SignatureDate", "EffectiveDate",
                          "SalesPriceAmount", "LoanAmount",
                          "SalesPriceAmountStndCode", "LoanAmountStndCode",
                          "LoanTypeStndCode",
                          "DataClassStndCode", "DocumentTypeStndCode",
                          "PartialInterestTransferStndCode", "IntraFamilyTransferFlag", "TransferTaxExemptFlag",
                          "PropertyUseStndCode", "AssessmentLandUseStndCode",
                          "OccupancyStatusStndCode"))


trans <- fread(file.path(vintage_dir, "06/ZTrans/Main.txt"),
               sep = "|",
               header = FALSE,
               stringsAsFactors = FALSE,
               quote = "",
               select = vars_main$column_id,
               col.names = vars_main$FieldName,
               encoding = "Latin-1"
)



# Keep only one record for each TransID.
# TransID is the unique identifier of a transaction.
# Multiple entries for the same TransID are due to updated records.
# The most recent record is identified by the greatest LoadID.

setkey(trans, TransId, LoadID)
trans <- unique(trans, by = c("TransId"))
trans[, LoadID := NULL]

write_fst(trans, file.path(WORKING, "ztraxdata/ZTrans/trans.fst"))

